library(tidyverse)
library(janitor)
library(plotly)
You test positive for a rare disease that only effects 0.001 (One in one thousand people).
So you ask the doctor:
How certain is it that I have this disease?
What are the chances that you actually have this disease?
Some would say 99%, the accuracy of the test
\[ P(B \mid A) = \frac{P(B) L(B \mid A)}{P(A)} \]
B <- Has Disease
A <- Positive test result
P(B|A) - The probability of having the disease given a positive test result
set.seed(70) # For reproducibility
# Parameters
n_patients <- 10000 # Total population size
n_diseased <- 10 # Number of patients with the disease
sensitivity <- 0.99 # True positive rate (sensitivity)
false_positive_rate <- 0.01 # False positive rate
# Step 1: Create the DataFrame with patients
patients <- data.frame(
patient_id = 1:n_patients,
has_disease = c(rep(1, n_diseased), rep(0, n_patients - n_diseased)) # 10 with the disease, rest without
)
# Shuffle the DataFrame to randomize patient order
patients <- patients %>%
sample_frac(size = 1)
# Step 2: Simulate the test results based on disease status
patients <- patients %>%
mutate(
# Test result is positive if the person has the disease and the test is sensitive,
# or if they don't have the disease but it's a false positive
test_result = case_when(
has_disease == 1 & rbinom(n_patients, size = 1, prob = sensitivity) == 1 ~ "positive",
has_disease == 0 & rbinom(n_patients, size = 1, prob = false_positive_rate) == 1 ~ "positive",
TRUE ~ "negative"
)
)
patients %>%
tabyl(has_disease, test_result)
## has_disease negative positive
## 0 9888 102
## 1 0 10
patients %>%
tabyl(has_disease)
## has_disease n percent
## 0 9990 0.999
## 1 10 0.001
probability_disease <- 0.001
patients %>%
tabyl(has_disease, test_result) %>%
adorn_percentages("row")
## has_disease negative positive
## 0 0.9897898 0.01021021
## 1 0.0000000 1.00000000
probability_positive_result_given_disease <- 1
patients %>%
tabyl(test_result)
## test_result n percent
## negative 9888 0.9888
## positive 112 0.0112
probabilty_positive_test_result <- 0.0112
(probability_positive_result_given_disease * probability_disease) / probabilty_positive_test_result
## [1] 0.08928571
# First test posterior probability
updated_probability_having_disease <- (probability_positive_result_given_disease * probability_disease) / probabilty_positive_test_result
updated_probability_having_disease
## [1] 0.08928571
Lets think through this:
patients %>%
tabyl(test_result)
## test_result n percent
## negative 9888 0.9888
## positive 112 0.0112
This indicated that the probability of a positive test is very low (0.0112%)
But now, we need to update P(A) as the probability of a second positive test
P(A)=P(A∩B)+P(A∩Bc) = P(A|B)P(B)+P(A|Bc)P(Bc) 😨 but we got this 😎
First lets test this based on the first test result and see if we get the same 0.0112% as above
P(A|B)P(B)+P(A|Bc)P(Bc)
probability_positive_result_given_disease * probability_disease
## [1] 0.001
patients %>%
tabyl(has_disease, test_result) %>%
adorn_percentages("row")
## has_disease negative positive
## 0 0.9897898 0.01021021
## 1 0.0000000 1.00000000
patients %>%
tabyl(test_result)
## test_result n percent
## negative 9888 0.9888
## positive 112 0.0112
0.01021 * 0.9888
## [1] 0.01009565
P(A|B)P(B)+P(A|Bc)P(Bc)
0.01009565 + 0.001
## [1] 0.01109565
P(A)=P(A∩B)+P(A∩Bc) = P(A|B)P(B)+P(A|Bc)P(Bc) = 0.0111
P(A)=P(A∩B)+P(A∩Bc) = P(A|B)P(B)+P(A|Bc)P(Bc) 😨 but we got this 😎
probability_positive_result_given_disease * updated_probability_having_disease
## [1] 0.08928571
P(A|Bc)P(Bc)
patients %>%
tabyl(has_disease, test_result) %>%
adorn_percentages("row")
## has_disease negative positive
## 0 0.9897898 0.01021021
## 1 0.0000000 1.00000000
P(A|Bc) = 0.01021
1 - updated_probability_having_disease
## [1] 0.9107143
0.01021 * 0.9107143
## [1] 0.009298393
P(A|B)P(B)+P(A|Bc)P(Bc) 😨
0.08928571 + 0.009298393
## [1] 0.0985841
(updated_probability_having_disease * probability_positive_result_given_disease) / 0.0986
## [1] 0.9055346
set.seed(70) # For reproducibility
# Parameters
n_patients <- 10000 # Total population size
n_diseased <- 10 # Number of patients with the disease
sensitivity <- 0.99 # True positive rate (sensitivity)
false_positive_rate <- 0.01 # False positive rate
second_test_sensitivity <- 0.90 # Second test: 90% of positives have the disease
# Step 1: Create the DataFrame with patients
patients_updated <- data.frame(
patient_id = 1:n_patients,
has_disease = c(rep(1, n_diseased), rep(0, n_patients - n_diseased)) # 10 with the disease, rest without
)
# Shuffle the DataFrame to randomize patient order
patients_updated <- patients_updated[sample(n_patients), ]
# Step 2: Simulate the first test results based on disease status
patients_updated <- patients_updated %>%
mutate(
# First test result: positive if the person has the disease and the test is sensitive,
# or if they don't have the disease but it's a false positive
test_result = case_when(
has_disease == 1 & rbinom(n(), 1, sensitivity) == 1 ~ "positive",
has_disease == 0 & rbinom(n(), 1, false_positive_rate) == 1 ~ "positive",
TRUE ~ "negative"
)
)
# Step 3: Simulate the second test results based on the first test result
patients_updated <- patients_updated %>%
mutate(
# Second test result logic:
second_test_result = case_when(
# If they tested positive in the first test and have the disease
test_result == "positive" & has_disease == 1 ~ ifelse(rbinom(n(), 1, second_test_sensitivity) == 1, "positive", "negative"),
# If they tested positive in the first test but don't have the disease (false positive)
test_result == "positive" & has_disease == 0 ~ ifelse(rbinom(n(), 1, false_positive_rate) == 1, "positive", "negative"),
# If they tested negative in the first test, they test negative in the second
TRUE ~ "negative"
)
)
patients_updated %>%
tabyl(has_disease, second_test_result) %>%
adorn_percentages("col")
## has_disease negative positive
## 0 0.9998998999 0.1
## 1 0.0001001001 0.9
patients_updated_longer <- patients_updated %>%
pivot_longer(cols = c("test_result", "second_test_result"),
names_to = "test",
values_to = "result")
patients_updated_longer$has_disease <- as.factor(patients_updated_longer$has_disease)
stacked <- ggplot(patients_updated_longer, aes(x = result, fill = has_disease)) +
geom_bar() +
facet_wrap(~test)
ggplotly(stacked)